In [1]:
import os
import warnings
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats

import data_dictionary as dd

# Install a blanket 'ignore' filter so no Python warnings are shown in the
# cell outputs. NOTE(review): this also hides deprecation warnings.
warnings.filterwarnings('ignore')

# Autoreload is currently switched off (the two lines below are commented out).
#%load_ext autoreload
#%autoreload 2
# Draw matplotlib figures inline, in the cell output area.
%matplotlib inline
# Ask the inline backend for 'retina' (high-resolution) figure output.
%config InlineBackend.figure_format = 'retina'

In [2]:
pwd


Out[2]:
'C:\\Users\\gary\\Documents\\aaaRepos\\machine_learning\\co-presence'

In [3]:
# Root folder that holds the downloaded ATUS/CPS data files.
# Set the ATUS_DATA_DIR environment variable to point the notebook at another
# location; when it is unset (or empty) the original local path is used, so
# existing runs behave exactly as before.
DATA_DIRECTORY = os.environ.get("ATUS_DATA_DIR") or "C:\\Users\\gary\\Documents\\data"

In [4]:
ls $DATA_DIRECTORY


 Volume in drive C is OS
 Volume Serial Number is F8CA-7CC7

 Directory of C:\Users\gary\Documents\data

07/03/2017  08:42 AM    <DIR>          .
07/03/2017  08:42 AM    <DIR>          ..
07/02/2017  08:15 AM    <DIR>          atusact_2016
07/01/2017  09:25 AM         3,684,229 atusact_2016.zip
07/02/2017  08:15 AM    <DIR>          atuscps_2016
07/01/2017  09:26 AM         6,860,353 atuscps_2016.zip
07/02/2017  09:28 AM           864,474 atuscpscodebk16.pdf
07/02/2017  09:29 AM           570,216 atusintcodebk16.pdf
07/01/2017  09:35 AM    <DIR>          atusresp_2016
07/01/2017  09:25 AM           841,679 atusresp_2016.zip
07/02/2017  08:15 AM    <DIR>          atusrost_2016
07/01/2017  09:25 AM           144,334 atusrost_2016.zip
07/02/2017  08:17 AM    <DIR>          atusrostec_2016
07/01/2017  09:26 AM            23,198 atusrostec_2016.zip
07/02/2017  08:17 AM    <DIR>          atussum_2016
07/01/2017  09:25 AM           713,076 atussum_2016.zip
07/02/2017  08:17 AM    <DIR>          atuswho_2016
07/01/2017  09:25 AM           841,409 atuswho_2016.zip
07/02/2017  04:16 PM    <DIR>          cps_00001.csv
07/02/2017  04:15 PM         1,215,296 cps_00001.csv.gz
07/02/2017  06:44 PM            98,267 freqvariables.pdf
07/02/2017  09:26 AM            70,656 hinc01_1.xls
07/03/2017  08:41 AM           285,378 lexiconnoex2016.pdf
07/03/2017  08:42 AM           797,090 lexiconwex2016.pdf
07/02/2017  08:00 PM           352,873 tu2016coderules.pdf
07/02/2017  06:44 PM           833,607 tuquestionnaire.pdf
              16 File(s)     18,196,135 bytes
              10 Dir(s)  668,077,223,936 bytes free

In [5]:
# Read the 2016 activity file, loading only the columns named in
# dd.ACTIVITY_FIELDS. os.path.join assembles the path from its parts so the
# separators match the host OS, rather than appending a '/'-separated suffix
# to a backslash-separated Windows directory.
filename = os.path.join(DATA_DIRECTORY, 'atusact_2016', 'atusact_2016.dat')
resp = pd.read_csv(filename, usecols=dd.ACTIVITY_FIELDS)
# Summary statistics of the numeric columns, displayed as the cell output.
resp.describe()


Out[5]:
TUCASEID TUACTIVITY_N TEWHERE TUACTDUR TUTIER1CODE TUTIER2CODE TUTIER3CODE
count 2.072130e+05 207213.000000 207213.000000 207213.000000 207213.000000 207213.000000 207213.000000
mean 2.016063e+13 12.085646 4.095235 81.298866 8.797860 3.514997 3.517762
std 3.473787e+08 8.580891 7.449853 125.492770 7.677521 7.291834 10.728536
min 2.016010e+13 1.000000 -1.000000 1.000000 1.000000 1.000000 1.000000
25% 2.016030e+13 5.000000 1.000000 15.000000 2.000000 1.000000 1.000000
50% 2.016061e+13 11.000000 1.000000 30.000000 11.000000 2.000000 1.000000
75% 2.016091e+13 17.000000 8.000000 90.000000 12.000000 3.000000 3.000000
max 2.016121e+13 81.000000 89.000000 1440.000000 50.000000 99.000000 99.000000

In [6]:
# Print column dtypes, non-null counts and memory usage of the loaded frame.
resp.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207213 entries, 0 to 207212
Data columns (total 9 columns):
TUCASEID        207213 non-null int64
TUACTIVITY_N    207213 non-null int64
TEWHERE         207213 non-null int64
TUACTDUR        207213 non-null int64
TUSTARTTIM      207213 non-null object
TUSTOPTIME      207213 non-null object
TUTIER1CODE     207213 non-null int64
TUTIER2CODE     207213 non-null int64
TUTIER3CODE     207213 non-null int64
dtypes: int64(7), object(2)
memory usage: 14.2+ MB

In [7]:
# Column names of the loaded frame, as a plain Python list.
resp.columns.tolist()


Out[7]:
['TUCASEID',
 'TUACTIVITY_N',
 'TEWHERE',
 'TUACTDUR',
 'TUSTARTTIM',
 'TUSTOPTIME',
 'TUTIER1CODE',
 'TUTIER2CODE',
 'TUTIER3CODE']

In [13]:
# Largest value found in the TUACTDUR column.
resp['TUACTDUR'].max()


Out[13]:
1440

In [22]:
# Keep only the rows whose three tier-code columns are all equal to 2.
resp222 = resp.loc[
    resp[['TUTIER1CODE', 'TUTIER2CODE', 'TUTIER3CODE']].eq(2).all(axis=1)
]

In [23]:
# Number of rows that matched the 2/2/2 tier-code filter.
resp222.shape[0]


Out[23]:
307

In [24]:
# Summary statistics for the rows selected into resp222.
resp222.describe()


Out[24]:
TUCASEID TUACTIVITY_N TEWHERE TUACTDUR TUTIER1CODE TUTIER2CODE TUTIER3CODE
count 3.070000e+02 307.000000 307.000000 307.000000 307.0 307.0 307.0
mean 2.016061e+13 16.332248 1.296417 15.381107 2.0 2.0 2.0
std 3.577121e+08 9.588274 1.452996 20.665592 0.0 0.0 0.0
min 2.016010e+13 2.000000 1.000000 1.000000 2.0 2.0 2.0
25% 2.016030e+13 8.500000 1.000000 5.000000 2.0 2.0 2.0
50% 2.016060e+13 16.000000 1.000000 10.000000 2.0 2.0 2.0
75% 2.016091e+13 22.000000 1.000000 15.000000 2.0 2.0 2.0
max 2.016121e+13 46.000000 12.000000 225.000000 2.0 2.0 2.0

## Print the data-dictionary definitions (activity, where, and who codes)


In [ ]:
# Walk the three nested levels of dd.ACTIVITY_CODE and print every code with
# its name, adding one tab of indentation per level.
for lvl1_code, lvl1_entry in dd.ACTIVITY_CODE.items():
    for lvl1_name, lvl2_map in lvl1_entry.items():
        print(lvl1_code, lvl1_name)
        for lvl2_code, lvl2_entry in lvl2_map.items():
            for lvl2_name, lvl3_map in lvl2_entry.items():
                print('\t', lvl2_code, lvl2_name)
                for lvl3_code, lvl3_name in lvl3_map.items():
                    print('\t\t', lvl3_code, lvl3_name)

In [ ]:
# Print each entry of dd.ACTIVITY_WHERE as "<code> <name>", one per line.
for where_code, where_name in dd.ACTIVITY_WHERE.items():
    print(where_code, where_name)

In [ ]:
# Print each entry of dd.ACTIVITY_WHO as "<code> <name>", one per line.
for who_code, who_name in dd.ACTIVITY_WHO.items():
    print(who_code, who_name)

In [ ]: